package pub_lib

import (
	"bytes"
	"fmt"
	"regexp"
	"sort"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"golang.org/x/net/html"
	"publish/pub-lib/baijia"
)

// Patterns used by ParseFeed, compiled once at package initialisation rather
// than on every call.
var (
	// feedParagraphRe matches a <p>...</p> element whose content contains no
	// nested tag; submatch 1 is the paragraph text.
	feedParagraphRe = regexp.MustCompile(`<[pP][^>]*>([^<]*?)</[pP]>`)
	// feedImageRe matches an <img> tag carrying a double-quoted src attribute;
	// submatch 1 is the src value.
	feedImageRe = regexp.MustCompile(`<img.+?src="([^"]+)".+?(/?)>`)
)

// ParseFeed extracts the content of an article as a feed stream.
//
// The input HTML is first normalised by formatHtml. Every tag-free paragraph
// becomes a Feed of Type "text" and every <img> with a src attribute becomes
// a Feed of Type "image". Index holds the byte offset at which the element
// starts in the normalised HTML. The combined list is passed through
// sort.Sort before being returned; the ordering is defined by baijia.Feeds
// (presumably ascending Index, i.e. document order).
//
// The result is never nil; it is an empty slice when nothing matches.
func ParseFeed(text string) []baijia.Feed {
	text = formatHtml(text)

	// A single index pass per pattern yields both the match offset and the
	// submatch bounds, so the text is not scanned twice.
	paragraphs := feedParagraphRe.FindAllStringSubmatchIndex(text, -1)
	images := feedImageRe.FindAllStringSubmatchIndex(text, -1)

	feeds := make(baijia.Feeds, 0, len(paragraphs)+len(images))

	for _, loc := range paragraphs {
		// loc[0] is the match start, loc[2]:loc[3] bounds submatch 1.
		feeds = append(feeds, baijia.Feed{Type: "text", Data: text[loc[2]:loc[3]], Index: loc[0]})
	}

	for _, loc := range images {
		src := text[loc[2]:loc[3]]
		switch {
		case strings.HasPrefix(src, "//"):
			// Protocol-relative URL: pin it to http.
			src = "http:" + src
		case !strings.HasPrefix(src, "http://") && !strings.HasPrefix(src, "https://"):
			// Anything without an http(s) scheme is treated as a bare host/path.
			src = "http://" + src
		}
		feeds = append(feeds, baijia.Feed{Type: "image", Data: src, Index: loc[0]})
	}

	sort.Sort(feeds)

	return feeds
}

// formatHtml normalises an HTML document into a flat run of <p>...</p> blocks.
//
// The document is parsed with goquery and flattened by exatraNodeText, which
// emits newline-separated text and media markup. Each non-blank line of that
// output, trimmed of surrounding white space, is wrapped in its own <p>
// element. The result contains no newlines between blocks.
//
// An empty string is returned when the input cannot be parsed.
func formatHtml(text string) string {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(text))
	if err != nil || doc == nil {
		// Without a document there is nothing to walk; touching doc.Nodes
		// on a nil document would panic.
		return ""
	}

	var raw bytes.Buffer
	for _, n := range doc.Nodes {
		raw.WriteString(exatraNodeText(n))
	}

	var out bytes.Buffer
	for _, line := range strings.Split(raw.String(), "\n") {
		// TrimSpace strips all Unicode white space, which already includes the
		// ASCII space and the ideographic space (U+3000).
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		// Filtering of source-attribution and disclaimer lines used to happen
		// here and is currently disabled.
		fmt.Fprintf(&out, "<p>%s</p>", line)
	}
	return out.String()
}

// exatraNodeText flattens node and its descendants into newline-separated
// text and media markup.
//
//   - Text nodes yield their data with leading and trailing spaces, carriage
//     returns and newlines removed.
//   - <img>, <embed>, <video> and any element with class "video_iframe" or
//     "tt-video-box" yield their own outer HTML on a separate line, after
//     presentation/tracking attributes are stripped and a lazy-load
//     "data-src" is promoted to "src". These attribute changes are applied to
//     the node itself.
//   - <script> elements are dropped, except the one whose src contains
//     "tt_player/tt.player.js", which is kept as outer HTML on its own line.
//   - Any other node with children yields the concatenation of its
//     children's output, followed by a newline for <p> and <h1>..<h6>.
//   - Everything else yields the empty string.
func exatraNodeText(node *html.Node) string {
	if node.Type == html.TextNode {
		// Text nodes need no goquery wrapper; only trim the edges.
		return strings.Trim(node.Data, " \r\n")
	}

	doc := goquery.NewDocumentFromNode(node)
	tag := node.DataAtom.String()

	switch {
	case tag == "img", tag == "embed", tag == "video",
		doc.HasClass("video_iframe"), doc.HasClass("tt-video-box"):
		stripped := []string{
			"data-type", "data-w", "data-ratio", "data-s", "data-width",
			"onload", "onclick", "style",
		}
		for _, name := range stripped {
			doc.RemoveAttr(name)
		}
		// Lazy-loaded media keep the real URL in data-src.
		if v, ok := doc.Attr("data-src"); ok {
			doc.SetAttr("src", v)
		}
		doc.RemoveAttr("data-src")
		return "\n" + outerhtml(doc.Selection) + "\n"

	case tag == "script":
		if v, ok := doc.Attr("src"); ok && strings.Contains(v, "tt_player/tt.player.js") {
			return "\n" + outerhtml(doc.Selection) + "\n"
		}
		return ""

	case node.FirstChild != nil:
		var buf bytes.Buffer
		for c := node.FirstChild; c != nil; c = c.NextSibling {
			buf.WriteString(exatraNodeText(c))
		}
		// Paragraphs and headings end the current output line.
		switch tag {
		case "p", "h1", "h2", "h3", "h4", "h5", "h6":
			buf.WriteString("\n")
		}
		return buf.String()
	}

	return ""
}

// outerhtml renders the first node of s, including the element's own tag,
// and returns the markup with HTML entities unescaped. An empty selection
// yields the empty string.
func outerhtml(s *goquery.Selection) string {
	if s.Length() < 1 {
		return ""
	}

	rendered := new(bytes.Buffer)
	// Best effort: a render error is ignored and whatever was written to the
	// buffer so far is still returned.
	_ = html.Render(rendered, s.Nodes[0])

	return html.UnescapeString(rendered.String())
}
